import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
plt.rcParams['figure.figsize'] = [12, 6]
plt.style.use('fivethirtyeight')
x = str
df = pd.read_csv('Core_Data/Film_Bang_Personnel_Master_Step_2.csv', dtype={'Trainee prof':x, '2020': x, '2019': x, '2018': x,
'2017': x, '2016': x, '2015': x, '2014': x, '2013': x, '2012': x, '2011': x, '2010': x, '2009': x,
'2008': x, '2007': x, '2006': x, '2005': x, '2004': x, '2003': x, '2002': x, '2001': x, '2000': x,
'1999': x, '1998': x, '1997': x, '1996': x, '1995': x, '1994': x, '1993': x, '1992': x, '1991': x,
'1990': x, '1989': x, '1988': x, '1987': x, '1986': x, '1984': x, '1982': x, '1981': x,
'1979': x, '1978': x, '1976': x})
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)
The code below brings basic analysis into one document and shows the output of various calculations. The anonymisation of the data has been carried out in a separate notebook so names, full postcodes, locations etc are removed. We're left with the first 3 characters of the 'last known' postcode where entries have one.
This notebook will have other subsidiary notebooks added to it over the next few days (3/10/20)
Index:
Number of Entries, Columns
df.shape
df.columns
# Display of sample of data
df[['Trainee prog', 'Gender', 'Role 1', 'Role 1 Category', 'Rural', 'No of Yrs']].head(20)
# plot_years = df.loc[:, '2020':'1976'].columns
# print(plot_years)
# Data
plot_years = [2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013,2012,2011,2010,2009,2008,2007,2006,2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989,1988,1987,1986,1984,1982,1981,1979,1978,1976]
totals = []
for column in df.loc[:, '2020':'1976']:
tally = df[column].value_counts(dropna=True)
totals.append(tally[0])
print(totals)
x_indexes = list(range(len(years_x)))
x_indexes.reverse()
plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')
plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Basic Growth and Decline')
plt.xticks(ticks=x_indexes, labels=years_x, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Historical_Events/basic_growth_decline.png')
plt.show()
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()
plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')
plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Periods of Growth and Decline')
plt.xticks(ticks=x_indexes, labels=plot_years, rotation=90)
plt.axvspan(x_indexes[40], x_indexes[30], facecolor='orange', alpha=0.5, label='Development of Screen Industry in Scotland')
plt.axvspan(x_indexes[30], x_indexes[20], facecolor='magenta', alpha=0.5, label='Deregulation, changing technology, flexible working')
plt.axvspan(x_indexes[20], x_indexes[10], facecolor='blue', alpha=0.5, label='Decline of industry')
plt.axvspan(x_indexes[10], x_indexes[0], facecolor='green', alpha=0.5, label='Increased Production, New Broadcasters')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Historical_Events/timeframes.png')
plt.show()
width=0.8
plt.bar(plot_years, totals, color='magenta', width = width, label='Total Workers Per Year')
plt.xlabel('Years')
plt.ylabel('No. of Entries')
plt.title('Entries Per Year')
plt.xticks(ticks=plot_years, rotation=90)
plt.legend()
plt.tight_layout()
plt.savefig('Outputs/Basic/entries_per_year.png')
plt.show()
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()
plt.plot(x_indexes, totals, color='#2E0014', label='Total Entries')
plt.xlabel('Years - ommitting 77, 80, 83, 85')
plt.ylabel('Number of Entries')
plt.title('Historical Events and Number of Entries')
plt.xticks(ticks=x_indexes, labels=plot_years, rotation=90)
plt.axvline(x=x_indexes[36], linewidth=2, color='blue', label='Channel 4 Starts')
plt.axvline(x=x_indexes[24], linewidth=2, color='orange', label='Trainspotting')
plt.axvline(x=x_indexes[21], linewidth=2, color='green', label='Tartan Shorts Launched')
plt.axvline(x=x_indexes[16], linewidth=2, color='cyan', label='STV Crisis')
plt.axvline(x=x_indexes[12], linewidth=2, color='red', label='Financial Crisis')
plt.axvline(x=x_indexes[1], linewidth=2, color='magenta', label='Film Bang Website Relaunched')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Historical_Events/workers_by_year_key_dates.png')
plt.show()
Figures for number of people in each department. Departments were allocated according to keywords found in role descriptions.
df['Role 1 Category'].value_counts()
Percentages for above figures
df['Role 1 Category'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
# Shows figures for those whose roles are consistent over the 3 columns (including empty values for 2 and 3),
# and those where the roles differ by department
df['Has Consistent Role'].value_counts(dropna=False)
# Percentages for the above
df['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(dropna=False)
# Percentages of the above
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
In the above data we can see the departments Direction and Producer show the greatest instances of people having different roles across Role Categories 1, 2 & 3
# Data
depts = df['Role 1 Category'].dropna().unique().tolist()
nums = df['Role 1 Category'].value_counts().tolist()
# Plot
width=0.8
plt.bar(depts, nums, color='magenta', width = width, label='Entries')
plt.xlabel('Depts')
plt.ylabel('No. Entries')
plt.title('Film Bang Department Count')
plt.xticks(ticks=depts, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Department_Stats/department_count.png')
plt.show()
filt = (df['Role 1 Category'] == 'Production')
production = df[filt]
#production
df_production = pd.DataFrame(production)
#print(df_production)
df_production.loc[production['Role 1'].str.contains('Unit|Director|ProductionAssistant|Engineer|Floor|Script|Autocue|Location|Property|Scout|Runner|Newcomer|Production Assistant|Stagehand|Stage Hand|Studio|Production Executive|Drapes'), 'Role 1 SubCategory'] = 'Location'
df_production.loc[production['Role 1'].str.contains('Line|Researcher|Production Manager|Secretary|Accountant|Payroll|Production Co-ordinator|Production Co-Ordinator|Office|Publicist'), 'Role 1 SubCategory'] = 'Admin'
df_production.loc[production['Role 1'].str.contains('Driver|Pilot|Transport'), 'Role 1 SubCategory'] = 'Transport'
# plt.style.use("fivethirtyeight")
# plt.rcParams['figure.figsize'] = [12, 6]
breakdown = production['Role 1 SubCategory'].value_counts().array
colors = ['#E49273', '#679436', '#A5BE00']
labels = ['Location', 'Admin', 'Transport']
plt.pie(breakdown, labels=labels, labeldistance=1.2, colors=colors, shadow=True,
startangle=45, autopct='%1.0f%%', wedgeprops={'edgecolor':'black'})
plt.title('Production Sub Categories')
plt.tight_layout()
plt.savefig('Outputs/Department_Stats/production_sub_categories_pie_chart.png')
plt.show()
# Create new df for storing filtered values
structure = {
'Year':[],
'Direction':[],
'Music':[],
'Producer':[],
'Construction':[],
'Costume':[],
'Hair & Make-Up':[],
'Sound':[],
'Post-Production':[],
'Casting':[],
'Support':[],
'Art':[],
'Camera':[],
'Production':[],
'Script':[],
'Special FX':[]
}
df_roles = pd.DataFrame(structure)
for column in df.loc[:, '2020':'1976']:
# filter Role 1 Category for entries of 'Direction'
filt1 = (df[column] == column) & (df['Role 1 Category'] == 'Direction')
# apply filter
direction = df[filt1]
# count items in dataframe oject
direction_count = len(direction.index)
filt2 = (df[column] == column) & (df['Role 1 Category'] == 'Music')
music = df[filt2]
music_count = len(music.index)
filt3 = (df[column] == column) & (df['Role 1 Category'] == 'Producer')
producer = df[filt3]
producer_count = len(producer.index)
filt4 = (df[column] == column) & (df['Role 1 Category'] == 'Construction')
construction = df[filt4]
construction_count = len(construction.index)
filt5 = (df[column] == column) & (df['Role 1 Category'] == 'Costume')
costume = df[filt5]
costume_count = len(costume.index)
filt6 = (df[column] == column) & (df['Role 1 Category'] == 'Hair & Make-Up')
hair = df[filt6]
hair_count = len(hair.index)
filt7 = (df[column] == column) & (df['Role 1 Category'] == 'Sound')
sound = df[filt7]
sound_count = len(sound.index)
filt8 = (df[column] == column) & (df['Role 1 Category'] == 'Post-Production')
postp = df[filt8]
postp_count = len(postp.index)
filt9 = (df[column] == column) & (df['Role 1 Category'] == 'Casting')
casting = df[filt9]
casting_count = len(casting.index)
filt10 = (df[column] == column) & (df['Role 1 Category'] == 'Support')
support = df[filt10]
support_count = len(support.index)
filt11 = (df[column] == column) & (df['Role 1 Category'] == 'Art')
art = df[filt11]
art_count = len(art.index)
filt12 = (df[column] == column) & (df['Role 1 Category'] == 'Camera')
camera = df[filt12]
camera_count = len(camera.index)
filt13 = (df[column] == column) & (df['Role 1 Category'] == 'Production')
production = df[filt13]
production_count = len(production.index)
filt14 = (df[column] == column) & (df['Role 1 Category'] == 'Script')
script = df[filt14]
script_count = len(script.index)
filt15 = (df[column] == column) & (df['Role 1 Category'] == 'Special FX')
special = df[filt15]
special_count = len(special.index)
df_roles = df_roles.append({
'Year': column,
'Direction': direction_count,
'Music': music_count,
'Producer': producer_count,
'Construction': construction_count,
'Costume': costume_count,
'Hair & Make-Up': hair_count,
'Sound': sound_count,
'Post-Production': postp_count,
'Casting': casting_count,
'Support': support_count,
'Art': art_count,
'Camera': camera_count,
'Production': production_count,
'Script': script_count,
'Special FX': special_count
}, ignore_index=True)
#view output
df_roles
## Plot Setup
#colors = ['#064789', '#427AA1', '#E49273', '#679436', '#A5BE00', '#F7567C', '#243E36', '#A52422', '#E28413', '#2E0014', '#4E0250', '#F24333', '#345E56']
colors = ['red', 'green', 'magenta', 'orange','blue', 'pink','purple', 'yellow','cyan', 'lime', 'brown', 'grey', 'black']
years = df_roles['Year']
producers = df_roles['Producer']
construction = df_roles['Construction']
costume = df_roles['Costume']
hairs = df_roles['Hair & Make-Up']
sound = df_roles['Sound']
postp = df_roles['Post-Production']
casting = df_roles['Casting']
support = df_roles['Support']
art = df_roles['Art']
camera = df_roles['Camera']
production = df_roles['Production']
script = df_roles['Script']
special = df_roles['Special FX']
x_indexes = list(range(len(years)))
x_indexes.reverse()
plt.plot(x_indexes, producers, color=colors[0], label='Producers')
plt.plot(x_indexes, construction, color=colors[12], label='Construction')
plt.plot(x_indexes, costume, color=colors[1], label='Costume')
plt.plot(x_indexes, hairs, color=colors[2], label='Hair & Make-Up')
plt.plot(x_indexes, sound, color=colors[3], label='Sound')
plt.plot(x_indexes, postp, color=colors[4], label='Post-Production')
plt.plot(x_indexes, casting, color=colors[5], label='Casting')
plt.plot(x_indexes, support, color=colors[6], label='Support')
plt.plot(x_indexes, art, color=colors[7], label='Art')
plt.plot(x_indexes, camera, color=colors[8], label='Camera')
plt.plot(x_indexes, production, color=colors[9], label='Production')
plt.plot(x_indexes, script, color=colors[10], label='Script')
plt.plot(x_indexes, special, color=colors[11], label='Special FX')
plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Number in Department by Year')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Department_Growth/number_in_dept_by_year.png')
plt.show()
plt.plot(x_indexes, camera, color=colors[8], label='Camera')
plt.plot(x_indexes, production, color=colors[9], label='Production')
plt.plot(x_indexes, art, color=colors[7], label='Art')
plt.plot(x_indexes, producers, color=colors[0], label='Producers')
plt.plot(x_indexes, postp, color=colors[4], label='Post-Production')
plt.plot(x_indexes, hairs, color=colors[2], label='Hair & Make-Up')
plt.plot(x_indexes, sound, color=colors[3], label='Sound')
plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Departments with Higher Growth')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Department_Growth/departments_higher_growth.png')
plt.show()
plt.plot(x_indexes, construction, color=colors[12], label='Construction')
plt.plot(x_indexes, costume, color=colors[1], label='Costume')
plt.plot(x_indexes, casting, color=colors[5], label='Casting')
plt.plot(x_indexes, support, color=colors[6], label='Support')
plt.plot(x_indexes, script, color=colors[10], label='Script')
plt.plot(x_indexes, special, color=colors[11], label='Special FX')
plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title('Departments with Lower Growth')
plt.xticks(ticks=x_indexes, labels=years, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
#plt.savefig('Outputs/Department_Growth/lower_growth_depts.png')
plt.show()
The dataset is compromised in terms of location data. We take the last known postcode of entries that include a postcode and check the postcode against a list of rural postcodes.
# Where no postcode is given
df['Postcode1'].isna().value_counts()
# Number of Entries with Rural Postcode
df['Rural'].value_counts()
# Entries with Urban Postcode / Entries with Rural Postcode
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(dropna=False)
# Percentage of Urban to Rural Postcodes
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
# Percentage of all postcode values inc nan to Rural Postcodes
df['Rural'].value_counts(normalize=True, dropna=False).apply(lambda x: format((x*100), '.0f'))
Data for location is inconsistent and not tracked accurately over time. We've reduced this to a look at what is effectively a 'last known postcode area' for each entry.
df['Postcode1'].value_counts().head(20)
# Trainee data float to string
df['Trainee prog'] = df['Trainee prog'].fillna(-1)
df['Trainee prog'] = df['Trainee prog'].astype(int)
df['Trainee prog'] = df['Trainee prog'].astype(str)
df['Trainee prog'] = df['Trainee prog'].replace('-1', np.nan)
df['Trainee prog'].count()
# Show number of trainees on Trainee Program in a given year
df['Trainee prog'].value_counts()
print('Average number of trainees over years the programs ran: ' + str(df['Trainee prog'].value_counts().median())
+ ' trainees')
# Filter Department for 'Trainee', return number of years in directory for each
filt = df['Role 1 Category'] == 'Trainee'
trainees = df[filt]
trainees['No of Yrs']
years = df.groupby(['Trainee prog'])
years['No of Yrs'].value_counts()
trainees = df[['Trainee prog', 'No of Yrs', 'Gender', 'Role 1 Category']].dropna()
trainees = trainees.sort_values('Trainee prog')
trainees
trainees['Gender'].value_counts()
trainees['Gender'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts()
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts(normalize=True)
year_grp = trainees.groupby(['Trainee prog'])
year_grp['Role 1 Category'].value_counts()
# Data
trainee_years = trainees['Trainee prog'].unique()
trainee_numbers = trainees.groupby(['Trainee prog'])
trainee_numbers = trainee_numbers['Trainee prog'].value_counts().array
# Plot
width=0.7
plt.bar(trainee_years, trainee_numbers, color='magenta', width = width, label='Trainees')
plt.xlabel('Year of Traineeship')
plt.ylabel('No. of Trainees')
plt.title('Trainees')
plt.xticks(ticks=trainee_years, rotation=90)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Trainees/trainees_per_year.png')
plt.show()
Basic figures for Career Lengths
# Number of Years in Directory, number of entries present for that duration
df['No of Yrs'].value_counts()
Career length grouped into 5 year bins
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
df.groupby(bins)['No of Yrs'].agg(['count'])
# Data
indexes = ['1 year','2-3 years', '4-6 years', '7-20 years', '20-41 years']
bins = pd.cut(df['No of Yrs'], [0, 1, 3, 6, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
# Simplify Data
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.5)
#colors = ['blue', 'magenta', 'orange', 'green', 'lime', 'cyan', 'yellow']
plt.pie(value_list, labels=indexes, labeldistance=1.2, shadow=True,
startangle=90, autopct='%1.0f%%', colors=colors, wedgeprops={'edgecolor':'black'})
plt.title('% of entries with career of given length')
plt.tight_layout()
plt.savefig('Outputs/Longevity_Pie/longevity_more_bins_percentages.png')
plt.show()
filtered = df['No of Yrs'].apply(lambda x: x > 1)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
filtered = df['No of Yrs'].apply(lambda x: x > 6)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
filtered = df['No of Yrs'].apply(lambda x: x > 15)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
filtered = df['No of Yrs'].apply(lambda x: x > 20)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
Median Career Length All Entries
df['No of Yrs'].median()
Median Career Length for Entries in for more than 1 Year
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
df2['No of Yrs'].median()
# Percentages
print('No Yrs Percentage')
print(df['No of Yrs'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f')))
# Showing Long Careers & Breakdown by Department
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().tail(107)
# Showing Short Careers Breakdown by Department Numbers
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().head(104)
The following two charts show Longevity in the directory with frequency of entries with the given 'career' lengths plotted along the x axis. We chart the pattern for all entries, and for entries who are present for more than 1 year. While entries lasting only 1 year are at first glance an anomaly, being by far the biggest single group in the dataset, but declaring the least information given their short presence, we see the same pattern (scaling down) with the removal of those entries from the chart.
# Data
keys_list = df['No of Yrs'].value_counts().index.tolist()
values = df['No of Yrs'].value_counts().values
median = df['No of Yrs'].median()
# Plot
width=0.8
color = sns.cubehelix_palette(start=3, dark=0.5)
plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.axvline(median, linewidth=2, color='blue', label='Median')
plt.xlabel('Years')
plt.ylabel('Number of Workers')
plt.title('Basic Longevity - Showing All Entries')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity.png')
plt.show()
# Data
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
keys_list = df2['No of Yrs'].value_counts().index.tolist()
values = df2['No of Yrs'].value_counts().values
median = df2['No of Yrs'].median()
# Plot
width=0.8
color = sns.cubehelix_palette(start=1.5, dark=0.5)
plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.axvline(median, linewidth=2, color='blue', label='Median')
plt.xlabel('Years')
plt.ylabel('Number of Workers')
plt.title('Basic Longevity > 1 Year')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity>1yr.png')
plt.show()
We are interested in the factors that allow people to develop longer careers. The following chart shows frequency of career lengths in the entries for between 4 and 20 years. This offers a wide sample of entries who are clearly getting something out of being in the directory
filt2 = (df['No of Yrs'] > 3 ) & (df['No of Yrs'] < 21)
df3 = df[filt2]
out = df3['No of Yrs'].value_counts()
keys_list = out.index.tolist()
values = df3['No of Yrs'].value_counts().values
median = df3['No of Yrs'].median()
color = sns.cubehelix_palette(start=2, dark=0.5)
width=0.8
plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.axvline(median, linewidth=2, color='blue', label='Median')
plt.xlabel('Years Presence in Film Bang')
plt.ylabel('No. of Crew')
plt.title('Longevity: Presence in FB 4-20 years')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity_4_20_yrs.png')
plt.show()
filt_1 = (df['No of Yrs'] < 2)
df_1 = df[filt_1]
msg = 'Number in department in for only 1 year, historical'
print(msg)
print(len(msg) *'-')
entries = []
for dept in depts:
filt_dept = (df_1['Role 1 Category'] == dept)
df_depts = df_1[filt_dept]
count = df_depts['No of Yrs'].value_counts().values
entries.append(count[0])
print(dept, count)
print(entries)
width=0.7
color = sns.cubehelix_palette(start=2, dark=0.5)
plt.bar(depts, entries, color=color, width = width, label='Crew')
plt.xlabel('Departments')
plt.ylabel('No. of Crew')
plt.title('In for 1 Year')
plt.xticks(ticks=depts, rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/crew_in_for_1_year.png')
plt.show()
# Data
filt5 = (df['No of Yrs'] > 20 )
df4 = df[filt5]
keys_list = df4['No of Yrs'].value_counts().index.tolist()
values = df4['No of Yrs'].value_counts().values
# Plot
color = sns.cubehelix_palette(start=1, dark=0.5)
width=0.7
plt.bar(keys_list, values, color=color, width = width, label='Crew')
plt.xlabel('Years Presence in Film Bang')
plt.ylabel('No. of Crew')
plt.title('Longevity: Presence in FB over 20 years')
plt.xticks(ticks=keys_list)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/basic_crew_longevity_>20_yrs.png')
plt.show()
import seaborn as sns
import matplotlib.pylab as plt
totals = df['Role 1 Category'].value_counts().array
depts = df['Role 1 Category'].value_counts().index.array
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
for index, dept in enumerate(depts):
# set chart colour based on index
color = sns.cubehelix_palette(start=index, dark=0.5)
# filter for dept in depts
filt_dept = (df['Role 1 Category'] == dept)
df_depts = df[filt_dept]
# set bins for aggregation
bins = pd.cut(df_depts['No of Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
values = df_depts.groupby(bins)['No of Yrs'].agg(['count'])
# unpack the data
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]
# calculate median career length for given department
median = str(df_depts['No of Yrs'].median())
# Plot
width = 0.7
total = str(totals[index])
label = 'Dept total = '+total+', \n Median= '+median+' years'
plt.bar(indexes, value_list, color=color, width = width, label=label)
plt.xlabel('Years Presence in Film Bang')
plt.ylabel('Number of Entries')
plt.title('Career lengths in ' + dept + ' dept. as of 2020')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Longevity_Bar_Charts/crew_longevity_by_year_prime_bins_bar_'+dept+'.png')
plt.show()
# do bins of years:
# breakdown by department for 30 - 41 years
# breakdown by department for 19-29 years etc
Using the calculation for career length each year (a count produced for each year an entry is present in the directory) we chart career lengths over time
years = [*range(1,42)]
#initialise a temporary dataframe
df_master = pd.DataFrame({'Years': years})
# loop through year columns and filter out NAN values
for year in df.loc[:, '2020':'1976']:
filtyear = df[year].notna()
dfyear = df[filtyear]
# get numbers
values_year = dfyear['Yr Cnt '+year].value_counts().array
keys_year = dfyear['Yr Cnt '+year].value_counts().index
# put them in mini temp dataframe
df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
# join the mini dataframes to the temporary dataframe
df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
data = df_master.set_index('Years')
# Plot setup
import seaborn as sns
import matplotlib.pylab as plt
plot_years = df.loc[:, '2020':'1976']
colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
plt.figure(figsize=(22,10))
ax = sns.heatmap(data, linewidth=0.3, cmap=colours, annot=True, fmt=".0f")
plt.yticks(rotation=0)
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
plt.xlabel('Years')
plt.ylabel('Length of Career in Years')
plt.title(f'Careers in Directory Over Time')
plt.savefig(f"Outputs/Heatmaps/career_longevity_basic_1.png")
plt.show()
depts = df['Role 1 Category'].value_counts().index.array
df_master = pd.DataFrame({'Years': years})
import seaborn as sns
import matplotlib.pylab as plt
plot_years = df.loc[:, '2020':'1976']
def plot(dept):
colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
plt.figure(figsize=(22,10))
ax = sns.heatmap(data, linewidth=0.3, cmap=colours, annot=True, fmt=".0f")
plt.yticks(rotation=0)
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
plt.xlabel('Years')
plt.ylabel('Length of Career in Years')
plt.title(f'Careers in {dept} Department Over Time')
plt.savefig(f"Outputs/Heatmaps/career_longevity_{dept}_2.png")
plt.show()
# Loop through departments
for i in depts:
#initialise a new df_master dataframe for each pass through departments
years = [*range(1,42)]
df_master = pd.DataFrame({'Years': years})
# filter for department of given iteration
deptfilt = (df['Role 1 Category'] == i)
df_dept = df[deptfilt]
# loop through year columns and filter out NAN values
for year in df.loc[:, '2020':'1976']:
filtyear = df_dept[year].notna()
dfyear = df_dept[filtyear]
# get numbers
values_year = dfyear['Yr Cnt '+year].value_counts().array
keys_year = dfyear['Yr Cnt '+year].value_counts().index
# put them in a new mini dataframe
df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
# join the mini dataframes to a temporary master dataframe
df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
# assign the new master dataframe to data variable and plot it
# plot function will expect data to be the current dataframe
df_master = df_master.set_index('Years')
data = df_master
plot(i)
Gender is inferred. The Film Bang directory does not ask people to declare gender, or other personal information. The gender presented here is inferred from the names of entries (done during data capture, prior to anonymization of the data). Where gender is recorded here as 'unknown' it refers to a name that might conventionally refer to both a man or woman.
df['Gender'].value_counts()
df['Gender'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
role_grp = df.groupby(['Role 1 Category'])
role_grp['Gender'].value_counts().tail(300)
# gender = []
# for column in df.loc[:, '2020':'1976']:
# yr_gp = df.groupby([column])
# year_data = yr_gp['Gender'].value_counts()
# gender.append(year_data)
# print(gender)
# df_g = pd.DataFrame(gender)
# print(df_g)
ct = {
'Year':[],
'Male':[],
'Female':[],
'Unknown':[],
}
gender_ct = pd.DataFrame(ct)
for column in df.loc[:, '2020':'1976']:
filt1 = (df[column] == column) & (df['Gender'] == 'Female')
new_df = df[filt1]
filt2 = (df[column] == column) & (df['Gender'] == 'Male')
new_df2 = df[filt2]
filt3 = (df[column] == column) & (df['Gender'] == 'Unknown')
new_df3 = df[filt3]
women = len(new_df.index)
men = len(new_df2.index)
no_gender = len(new_df3.index)
gender_ct = gender_ct.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)
data = gender_ct
years_x = data['Year']
no_g_y = data['Unknown']
male_y = data['Male']
female_y = data['Female']
bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color='orange', width=width, label='Gender Unknown')
plt.bar(years_x, male_y, bottom=female_y, color='green', width=width, label='Male')
plt.bar(years_x, female_y, color='blue', width=width, label='Female')
plt.xlabel('Years')
plt.ylabel('No. Entries')
plt.title('Entire Directory by Gender')
plt.xticks(ticks=years_x, rotation=90)
plt.gca().invert_xaxis()
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Crew_Gender/basic_gender_chart.png")
plt.show()
# Numerical data on gender by year
print(gender_ct)
# Refactor this to break into smaller parts: data manipulation and plots.
# Create plot function with a dept parameter
# Call plot function within the loop, passing the dept / column variable as argument.
width = 0.7
gender_by_year = {
'Year':[],
'Male':[],
'Female':[],
'Unknown':[],
}
df_gender = pd.DataFrame(gender_by_year)
depts = df['Role 1 Category'].dropna().unique()
depts
for i in depts:
df_gender = df_gender[0:0]
filt = (df['Role 1 Category'] == i)
department = df[filt]
for column in department.loc[:, '2020':'1976']:
filt1 = (department[column] == column) & (department['Gender'] == 'Female')
new_df = department[filt1]
filt2 = (department[column] == column) & (department['Gender'] == 'Male')
new_df2 = department[filt2]
filt3 = (department[column] == column) & (department['Gender'] == 'Unknown')
new_df3 = department[filt3]
women = len(new_df.index)
men = len(new_df2.index)
no_gender = len(new_df3.index)
df_gender = df_gender.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)
data = df_gender
years_x = data['Year']
no_g_y = data['Unknown']
male_y = data['Male']
female_y = data['Female']
bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color='#427AA1', width=width, label='Unknown')
plt.bar(years_x, male_y, bottom=female_y, color='#679436', width=width, label='Male')
plt.bar(years_x, female_y, color='#E28413', width=width, label='Female')
plt.xlabel('Years')
plt.ylabel('No. Workers')
plt.title(f'{i} Department Workers by Gender')
plt.xticks(ticks=years_x, rotation=90)
plt.gca().invert_xaxis()
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(f"Outputs/Departments_Gender/{i}_department_workers_by_gender.png")
plt.show()
g = df.groupby(['No of Yrs'])
g_count = g['Gender'].value_counts()
print(g_count)
# Create Dataframe with Yrs column
df_clg = pd.DataFrame({'Yrs': range(1, 43)})
# Filtering
m_filt = (df['Gender'] == 'Male')
df_m = df[m_filt]
f_filt = (df['Gender'] == 'Female')
df_f = df[f_filt]
u_filt = (df['Gender'] == 'Unknown')
df_u = df[u_filt]
# parse data
m_length = df_m.groupby(['No of Yrs'])
m_c_length_values = m_length['Gender'].value_counts().array
m_c_length_keys = m_length['Gender'].value_counts().index.tolist()
m_indexes = [i[0] for i in m_c_length_keys]
f_length = df_f.groupby(['No of Yrs'])
f_c_length_values = f_length['Gender'].value_counts().array
f_c_length_keys = f_length['Gender'].value_counts().index.tolist()
f_indexes = [i[0] for i in f_c_length_keys]
u_length = df_u.groupby(['No of Yrs'])
u_c_length_values = u_length['Gender'].value_counts().array
u_c_length_keys = u_length['Gender'].value_counts().index.tolist()
u_indexes = [i[0] for i in u_c_length_keys]
# Join Data
df_male = pd.DataFrame({'M Yrs':m_indexes, 'M Values':m_c_length_values})
df_clg = df_clg.join(df_male.set_index('M Yrs'), on='Yrs')
df_female = pd.DataFrame({'F Yrs':f_indexes, 'F Values':f_c_length_values})
df_clg = df_clg.join(df_female.set_index('F Yrs'), on='Yrs')
df_unknown = pd.DataFrame({'U Yrs':u_indexes, 'U Values':u_c_length_values})
df_clg = df_clg.join(df_unknown.set_index('U Yrs'), on='Yrs')
print(df_clg)
width = 0.3
yrs = df_clg['Yrs']
male = df_clg['M Values']
female = df_clg['F Values']
unknown = df_clg['U Values']
#bars = np.add(male_y, female_y).tolist()
plt.bar(yrs + width, unknown, color='orange', width=width, label='Gender Unknown', log=True)
plt.bar(yrs, male, color='green', width=width, label='Male',log=True)
plt.bar(yrs - width, female, color='blue', width=width, label='Female',log=True)
plt.xlabel('Career Length in Years')
plt.ylabel('No. Entries - Log Scale')
plt.title('Career Length by Gender')
plt.xticks(ticks=yrs, rotation=90)
#plt.gca().invert_xaxis()
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Crew_Gender/gender_longevity_log.png")
plt.show()
bins = pd.cut(df_clg['Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
m_values = male.groupby(bins)
f_values = female.groupby(bins)
u_values = unknown.groupby(bins)
#print(m_values)
m_out = [i for i in m_values]
f_out = [i for i in f_values]
u_out = [i for i in u_values]
# Data
m_totals = []
f_totals = []
u_totals = []
for i in m_out:
m_totals.append(i[1].sum())
for i in f_out:
f_totals.append(i[1].sum())
for i in u_out:
u_totals.append(i[1].sum())
# Refactor this to OOP plot (fig, axs) in order to overlay the 3 plots as subplots (allowing different bars in each bin)
# Plot
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
width = 0.3
#total = str(totals[index])
plt.bar(indexes, m_totals, color='orange', width=width, align='edge', label='Male')
plt.bar(indexes, f_totals, color='green', width=-width, align='edge', label='Female')
plt.bar(indexes, u_totals, color='blue', width=width, label='Unknown Gender')
plt.xlabel('Year Bins')
plt.ylabel('Number of Entries')
plt.title('Career Length Bins')
plt.legend()
#plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Crew_Gender/longevity_gender_crew_bins.png')
plt.show()
group by department, longevity